1. Identification of PAR genes
1.1 PAR genes in zebra finch
###############################################################################
# DOWNLOAD EXTERNAL GENOME DATA #
###############################################################################
# Zebra finch assembly bTaeGut2.pat.W.v2 (GCF_008822105.2) from the NCBI FTP
# site: genome FASTA, GTF annotation, CDS FASTA and GFF annotation.
ncbi_assembly="GCF_008822105.2_bTaeGut2.pat.W.v2"
ncbi_ftp_dir="ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/008/822/105/${ncbi_assembly}"
for suffix in genomic.fna.gz genomic.gtf.gz cds_from_genomic.fna.gz genomic.gff.gz; do
  wget "${ncbi_ftp_dir}/${ncbi_assembly}_${suffix}"
done
###############################################################################
# EXTRACT CHROMOSOME Z SEQUENCE #
###############################################################################
# Decompress every .gz file in the working directory, index the genome FASTA,
# then write the sequence named NC_045027.1 to its own FASTA file.
genome_fna="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.fna"
chrZ_fasta="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.chrZ.fasta"
gunzip *.gz
samtools faidx "$genome_fna"
samtools faidx "$genome_fna" NC_045027.1 > "$chrZ_fasta"
###############################################################################
# CLEAN AND PROCESS GTF FILE #
###############################################################################
# Shrink the attribute column of the GTF. Step by step, per line:
#   1. put a tab in front of the first "exon_number" and the first "protein_id";
#   2. turn the second ';' into a tab, which closes column 9 after the first
#      two attributes (presumably gene_id and transcript_id - verify against
#      the input GTF);
#   3. keep columns 1-9, 11 and 12, i.e. drop column 10 (the attributes that
#      sat between the second ';' and the first inserted tab);
#   4. glue the kept pieces back together: the 9th tab becomes "; " and the
#      tab that is then 9th is removed.
# The '\t' escapes in the sed expressions rely on GNU sed.
zebra_gtf="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.gtf"
gtf_clean="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.small.clean.gtf"
sed \
  -e 's/exon_number/\texon_number/' \
  -e 's/protein_id/\tprotein_id/' \
  -e 's/;/\t/2' \
  "$zebra_gtf" |
  cut -f1-9,11,12 |
  sed -e 's/\t/; /9' -e 's/\t//9' > "$gtf_clean"
###############################################################################
# FILTER LONGEST TRANSCRIPT PER GENE #
###############################################################################
# Make CGAT 0.3.3 available: load the module, source its conda setup script,
# then activate the "base" and "cgat-s" environments in that order.
module load bioinfo-tools CGAT/0.3.3
source /sw/apps/bioinfo/CGAT/0.3.3/rackham/conda-install/etc/profile.d/conda.sh
conda activate base
conda activate cgat-s
gtf_longest="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.small.clean.longestTranscript.gtf"
# Discard every line that contains the string "unknown" anywhere, then let
# gtf2gtf apply its "longest-transcript" filter to what remains.
grep -v "unknown" < "$gtf_clean" |
  cgat gtf2gtf --method=filter --filter-method=longest-transcript > "$gtf_longest"
###############################################################################
# GENERATE EXON FASTA FOR LONGEST TRANSCRIPTS #
###############################################################################
# Extract sequences for the longest-transcript annotation from the genome
# FASTA (-g). Two FASTA files are written, one via -w and one via -x; see the
# gffread documentation for the exact content of each (presumably spliced
# exons and spliced CDS respectively - TODO confirm).
gffread \
  -g GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.fna \
  -w longestTranscripts.exons.fa \
  -x longestTranscripts.exons.separate.fa \
  "$gtf_longest"
###############################################################################
# EXTRACT PAR AND CHROMOSOME Z GENES #
###############################################################################
chrZ="NC_045027.1"
# Scaffold whose annotations are written to the W PAR list below.
par_w_scaffold="NW_022611471.1"
# Exons on Z whose end coordinate is below this value go into the PAR lists.
par_end=550000

#######################################
# Print the unique pairs formed by the 2nd and 4th whitespace-separated words
# of the GTF attribute column (gene ID and transcript ID when the attributes
# start with: gene_id "..."; transcript_id "...";), tab-separated and sorted.
# The sequence name is compared for exact equality with column 1, so an
# accession that merely appears in another column, or that only matches when
# '.' is read as a regex wildcard, is not picked up.
# Arguments:
#   $1 - GTF file
#   $2 - sequence name required in column 1
#   $3 - optional feature type required in column 3 (empty = any)
#   $4 - optional exclusive upper bound on the end coordinate in column 5
# Outputs: "<gene_id>TAB<transcript_id>" lines on stdout
#######################################
gene_transcript_pairs() {
  # Input is redirected rather than passed as an operand: awk skips an empty
  # file operand and would then wait on stdin.
  awk -F'\t' -v seqid="$2" -v feature="${3:-}" -v max_end="${4:-}" '
    $1 != seqid { next }
    feature != "" && $3 != feature { next }
    max_end != "" && ($5 + 0) >= (max_end + 0) { next }
    { print $9 }
  ' < "$1" |
    awk '{ print $2, $4 }' |
    tr -d ';"' |
    sort -u |
    awk '{ sub(/ /, "\t"); print }'
}

# Gene and transcript IDs in the pseudoautosomal region (PAR) on Z
gene_transcript_pairs "$gtf_longest" "$chrZ" exon "$par_end" > PAR_genes_geneID_transID.list
# Second column only (the transcript IDs) of the PAR table
cut -f2 PAR_genes_geneID_transID.list > ZF_PAR.genes.new.list
# All Z genes with gene and transcript ID
gene_transcript_pairs "$gtf_longest" "$chrZ" exon > ZF_Z_chr.genes_geneID_transID.list
# Second column only (the transcript IDs) of the Z table
cut -f2 ZF_Z_chr.genes_geneID_transID.list > ZF_Z_chr.genes.list
# W-linked PAR genes (every feature type on the scaffold, not only exons)
gene_transcript_pairs "$gtf_longest" "$par_w_scaffold" > W_PAR_genes_geneID_transID.list
# W PAR table with three space-separated columns per line (gene name, gene ID,
# transcript ID), written verbatim. The delimiter is quoted so the shell
# performs no expansion on the rows.
cat > W_PAR_genes_geneID_transID_withGeneName.list <<'EOF'
NEDD4L LOC100217943 XM_030259050.2
ZNF532 LOC100218930 XM_030258323.2
ATP8B1 LOC100220790 XM_030259059.2
FECH LOC100223728 XM_032744869.1
MALT1 LOC100224698 XM_012577803.3
ALPK2 LOC100227584 XM_030258318.2
ST8SIA3 LOC100229524 XM_030257523.2
WDR7 LOC100232465 XM_030258950.2
NARS1 LOC116806604 XM_032744866.1
TXNL1 LOC116807012 XM_032744865.1
ONECUT2 LOC116807017 XR_004366058.1
LMAN1 LOC116807022 XM_032744872.1
RAX LOC116807023 XM_032744877.1
GRP LOC116807024 XM_032744878.1
SEC11C SEC11C XM_032744862.1
EOF
# Updated W PAR table: the comma-separated rows below (name, gene ID,
# transcript ID) are rewritten as three tab-separated columns.
awk 'BEGIN { FS = ","; OFS = "\t" } { print $1, $2, $3 }' \
  > W_PAR_genes_geneID_transID_withGeneName.updated.list <<'EOF'
LMAN1,LOC116807022,XM_032744872.1
uncharacterized2,LOC115491185,XR_004366070.1
RAX,LOC116807023,XM_032744877.1
GRP,LOC116807024,XM_032744878.1
SEC11C,SEC11C,XM_032744862.1
ZNF532,LOC100218930,XM_030258323.2
MALT1,LOC100224698,XM_012577803.3
ALPK2,LOC100227584,XM_030258318.2
uncharacterized3,LOC116807018,XR_004366060.1
NEDD4L,LOC100217943,XM_030259050.2
uncharacterized4,LOC116807014,XR_004366054.1
ATP8B1,LOC100220790,XM_030259059.2
NARS1,LOC116806604,XM_032744866.1
FECH,LOC100223728,XM_032744869.1
ONECUT2,LOC116807017,XR_004366058.1
ST8SIA3,LOC100229524,XM_030257523.2
WDR7,LOC100232465,XM_030258950.2
TXNL1,LOC116807012,XM_032744865.1
EOF
# PAR table including the uncharacterized entries: the comma-separated rows
# below (name, gene ID, transcript ID) are rewritten as three tab-separated
# columns.
awk 'BEGIN { FS = ","; OFS = "\t" } { print $1, $2, $3 }' \
  > PAR_genes_geneID_transID_geneName.list <<'EOF'
uncharacterized1,LOC116806781,XR_004365834.1
LMAN1,LOC100220399,XM_030258642.2
uncharacterized2,LOC116806731,XR_004365759.1
RAX,RAX,NM_001243734.2
GRP,LOC105760884,XM_030258647.2
SEC11C,LOC116806596,XM_030257196.2
ZNF532,LOC116806602,XM_032744080.1
MALT1,LOC116806749,XM_032744259.1
ALPK2,LOC116806817,XM_032744489.1
uncharacterized3,LOC116806818,XR_004365896.1
MIR122,MIR122,NR_049051.1
NEDD4L,LOC116806603,XM_032744495.1
uncharacterized4,LOC115491286,XR_003957248.2
uncharacterized5,LOC116806819,XR_004365897.1
ATP8B1,LOC116806743,XM_032744197.1
NARS1,NARS1,XM_030259060.2
FECH,LOC116806605,XM_030259064.2
ONECUT2,LOC100226718,XR_003957244.2
ST8SIA3,LOC116806742,XM_032744196.1
uncharacterized6,LOC115491263,XR_003957222.2
WDR7,LOC116806851,XM_032744613.1
TXNL1,LOC100218989,XM_032744615.1
uncharacterized7,LOC116806852,XR_004365928.1
EOF
###############################################################################
# EXTRACT W PAR TRANSCRIPTS #
###############################################################################
#######################################
# Print a FASTA file with each record on exactly two lines: the header line
# unchanged, followed by all of its sequence lines joined into one line
# (an empty line when the record has no sequence).
# No blank line is written before the first header, and an empty input
# produces no output at all.
# Arguments: $1 - path to the FASTA file
# Outputs:   the linearised FASTA on stdout
#######################################
oneline_fasta() {
awk '
/^>/ { if (NR > 1) printf("\n"); print; next }
{ printf("%s", $0) }
END { if (NR > 0) printf("\n") }
' "$1"
}
gtf_path="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.small.clean.longestTranscript.gtf"
exon_fasta="longestTranscripts.exons.fa"
# Linearise the transcript FASTA once, not once per ID inside the loop.
linear_fasta=$(mktemp)
oneline_fasta "$exon_fasta" > "$linear_fasta"
# Take the 12th whitespace-separated word (the value after transcript_id when
# the attributes start with: gene_id "..."; transcript_id "...";) of every
# annotation line whose sequence name is exactly NW_022611471.1, and print the
# matching FASTA line plus the line after it (the joined sequence).
awk -F'\t' '$1 == "NW_022611471.1"' "$gtf_path" |
  awk '{print $12}' |
  sort -u |
  tr -d '";' |
  while read -r transcript_id; do
    # An empty ID would match every line and dump the whole FASTA.
    [ -n "$transcript_id" ] || continue
    # -F: the ID is a literal string; its '.' must not act as a wildcard.
    grep -F -A1 -- "$transcript_id" "$linear_fasta"
  done > longestTranscripts.exons.PAR.W.fa
rm -f -- "$linear_fasta"
###############################################################################
# BLAST SEARCH #
###############################################################################
# Build a nucleotide BLAST database from the transcript FASTA, then search the
# W PAR transcripts against it; results are written in output format 6.
module load bioinfo-tools blast
makeblastdb -dbtype nucl -parse_seqids -in "$exon_fasta"
blastn -query longestTranscripts.exons.PAR.W.fa -db "$exon_fasta" -outfmt 6 \
  > zebrafinch_PAR_W_blast_results.out
1.2 Synteny analyses to the Sylvioidea reference genomes
###############################################################################
# METADATA #
###############################################################################
# Species code -> reference genome name, written as two tab-separated columns.
awk 'BEGIN { OFS = "\t" } { print $1, $2 }' > samples_ref_genome.list <<'EOF'
AcrSch Acrocephalus_schoenobaenus
AegCau Aegithalos_caudatus
AlaArv skylark_min1kb
CetCet Cettia_cetti
CisJun Cisticola_juncidis
HirDau Hirundo_daurica
LocLus Locustella_luscinioides
PanBia panurus_min1kb
PhyCol Phylloscopus_collybita
PycBar Pycnonotus_barbatus
SylAtr_1EV02922 Sylvia_atricapilla_1EV02922
SylBra Sylvietta_brachyura
TurAlt Turdoides_altirostris
EreAlp horned_lark_min1kb
EOF
# Two sample IDs and the species code per line, written as three tab-separated
# columns (presumably one sample per sex - TODO confirm which column is which).
awk 'BEGIN { FS = ","; OFS = "\t" } { print $1, $2, $3 }' > samples_sex_sameline.tsv <<'EOF'
QF-1504-CP59475_S11_L004,QF-1504-BL37630_S12_L004,AegCau
QL-1681-19_S46_L006,QL-1681-21_S47_L006,AlaArv
QF-1504-P182137_S9_L003,QF-1504-2L18122_S4_L002,CetCet
QF-1504-CISJUN-2_S6_L002,QF-1504-RA5680_S5_L002,CisJun
QF-1504-P182141_S2_L001,QF-1504-P182142_S1_L001,HirDau
QF-1504-LOCLUS-43_S1_L001,QF-1504-LOCLUS-24_S3_L001,LocLus
QF-1504-2KR32024_S2_L001,QF-1504-1ET92164_S3_L001,PanBia
QF-1504-R86159_S5_L002,QF-1504-Z81303_S4_L002,PhyCol
1EL38952_S2_L001,1EV02922_S4_L002,SylAtr_1EV02922
QF-1504-CT90325_S17_L006,QF-1504-CT90312_S18_L006,AcrSch
QF-1504-H-19_S8_L003,QF-1504-H-88_S7_L003,EreAlp
SJ-2333-IB-2b_S32_L002,SJ-2333-IB-1a_S31_L002,TurAlt
SJ-2333-Pbar-197_S24_L002,SJ-2333-Pbar-421_S22_L002,PycBar
SJ-2333-Sbra-553_S28_L002,SJ-2333-Sbra-878_S26_L002,SylBra
EOF
# Append the reference genome name as a fourth tab-separated column. Rows come
# out in the order of samples_ref_genome.list.
while read -r sp ref; do
  awk -v species="$sp" -v refgen="$ref" \
    'BEGIN { OFS = "\t" } $3 == species { print $0, refgen }' \
    samples_sex_sameline.tsv
done < samples_ref_genome.list > samples_sex_sameline_ref.tsv
#### Kraken base configuration template (kraken_base_config_bTaeGut2.pat.W.v2):
[genomes]
sylvioidea QUERY.fasta
zf GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.chrZ.fasta
[pairwise-maps]
zf sylvioidea satsuma/bTaeGut1.pat.W.v2/SPECIES/satsuma_summary.chained.out
#### Kraken base script (kraken_Z_base.sh)
#! /bin/bash -l
#
#SBATCH -p core -n 2
#SBATCH -t 4:00:00
#SBATCH -A snic2020-5-33 -J kraken_SPECIES
#
# Job template: runs RunKraken with the per-species config, translating the
# cleaned zebra finch GTF (-s) from the genome labelled "zf" (-S) to the one
# labelled "sylvioidea" (-T), and writes the result to mapped.gtf.
# The upper-case species and query tokens are placeholders to be substituted
# before submission.
# target, query and output are not used by the commands in this script.
target="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.chrZ.fasta"
query="QUERY"
output="satsuma/bTaeGut1.pat.W.v2/SPECIES"
mkdir -p kraken/bTaeGut1.pat.W.v2/SPECIES
# -f: a previous mapped.gtf is removed if present; a first run without one
# must not report an error.
rm -f kraken/bTaeGut1.pat.W.v2/SPECIES/mapped.gtf
~/bin/kraken/bin/RunKraken -c kraken_SPECIES_config_bTaeGut2.pat.W.v2 -s GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.small.clean.gtf -S zf -T sylvioidea -o kraken/bTaeGut1.pat.W.v2/SPECIES/mapped.gtf
#### Prepare Kraken config for each species
# For every "species-code reference-name" line, fill in the config template:
# the first SPECIES on each line becomes the species code and the first QUERY
# on each line becomes the reference name.
while read -r sp ref; do
  sed -e "s/SPECIES/${sp}/" -e "s|QUERY|${ref}|" \
    kraken_base_config_bTaeGut2.pat.W.v2 > "kraken_${sp}_config_bTaeGut2.pat.W.v2"
done < samples_ref_genome.list
#### Satsuma + kraken base script (satsuma_Z_base.bTaeGut2.pat.W.v2.sbatch)
#! /bin/bash -l
#
#SBATCH -p core -n 18
#SBATCH -t 1-12:00:00
#SBATCH -A snic2020-5-33 -J satsuma_SPECIES
#
# Job template: runs SatsumaSynteny with the zebra finch chrZ FASTA as target
# and the per-species genome as query, then runs RunKraken with the
# per-species config. The upper-case species and query tokens are placeholders
# to be substituted before submission.
target="GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.chrZ.fasta"
query="QUERY"
output="satsuma/bTaeGut1.pat.W.v2/SPECIES"
~/bin/satsuma-code-0/SatsumaSynteny -t "$target" -q "$query" -o "$output" -n 17
mkdir -p kraken/bTaeGut1.pat.W.v2/SPECIES
# NOTE(review): the annotation passed with -s here ends in ".edit.gtf", while
# the other Kraken runs in this document use ".small.clean.gtf"; no step in
# this document creates the ".edit.gtf" file - confirm which one is intended.
~/bin/kraken/bin/RunKraken \
  -c kraken_SPECIES_config_bTaeGut2.pat.W.v2 \
  -s GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.edit.gtf \
  -S zf -T sylvioidea \
  -o kraken/bTaeGut1.pat.W.v2/SPECIES/mapped.gtf
#### Run analyses
# For every "species-code reference-name" line, write a job script from the
# template (every SPECIES becomes the species code, every QUERY becomes
# "<reference-name>.fasta") and submit it.
while read -r sp ref; do
  job_script="satsuma_Z_${sp}.bTaeGut2.pat.W.v2.sbatch"
  sed -e "s/SPECIES/${sp}/g" -e "s|QUERY|${ref}.fasta|g" \
    satsuma_Z_base.bTaeGut2.pat.W.v2.sbatch > "$job_script"
  sbatch "$job_script"
done < samples_ref_genome.list
1.3 Synteny analysis to the flycatcher genome
#! /bin/bash -l
#
#SBATCH -p core -n 18
#SBATCH -t 4:00:00
#SBATCH -A snic2020-5-33 -J satsuma_FicAlb
#
# Runs SatsumaSynteny with the zebra finch chrZ FASTA as target and the
# FicAlb1.5 genome as query, then runs RunKraken with the FicAlb config on the
# cleaned zebra finch GTF.
# 18 cores are requested to cover the 17 processes given to SatsumaSynteny
# (-n 17), matching the per-species Satsuma job template.
# NOTE(review): the time limit is 4 h, while the per-species Satsuma template
# asks for 1-12:00:00 - confirm that 4 h is enough for this run.
target="data/external_raw/genome/GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.chrZ.fasta"
query="data/external_raw/genome/FicAlb1.5.editNames.fasta"
output="intermediate/satsuma/bTaeGut1.pat.W.v2/FicAlb"
# Same install location as the other job scripts, without a hard-coded home.
~/bin/satsuma-code-0/SatsumaSynteny -t "$target" -q "$query" -o "$output" -n 17
mkdir -p intermediate/kraken/bTaeGut1.pat.W.v2/FicAlb
~/bin/kraken/bin/RunKraken \
  -c code/kraken_FicAlb_config_bTaeGut2.pat.W.v2 \
  -s data/external_raw/genome/GCF_008822105.2_bTaeGut2.pat.W.v2_genomic.small.clean.gtf \
  -S zf -T sylvioidea \
  -o intermediate/kraken/bTaeGut1.pat.W.v2/FicAlb/mapped.gtf